## Setup

Install the required packages and load the libraries. If you have not installed the tidyverse and lubridate packages yet, uncomment the two `install.packages()` lines below and run them first.

#install.packages("tidyverse") 
#install.packages("lubridate")
library(tidyverse)
library(lubridate)

# Load the Wichita traffic-stop records from the course website.
wichita <- read_csv(
  file = "https://datajournalism.tech/wp-content/uploads/2019/10/wichita.csv"
)

# Population counts per race group; the labels use the same spelling as the
# subject_race column so the two tables can be joined on it later.
population <- tibble(
  subject_race = c(
    "asian/pacific islander",
    "black",
    "hispanic",
    "other/unknown",
    "white"
  ),
  num_people = c(19262, 42679, 63659, 13451, 246343)
)

## Data Analysis

Explore the dataset provided by Stanford University. See more on their website https://openpolicing.stanford.edu.

# Open the spreadsheet-style data viewer, but only in an interactive session:
# View() needs a graphical data viewer, which is not available when the
# script is run non-interactively (for example with Rscript).
if (interactive()) {
  View(wichita)
}
str(wichita) # to see the type and first values of every variable
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 57750 obs. of  22 variables:
##  $ X1                     : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ raw_row_number         : chr  "923578" "923657" "912091" "923680" ...
##  $ date                   : Date, format: "2016-01-01" "2016-01-01" ...
##  $ time                   : 'hms' num  18:00:00 18:08:00 18:11:00 18:13:00 ...
##   ..- attr(*, "units")= chr "secs"
##  $ location               : chr  "N WEST ST, KS, 67205" "8000 W 13TH ST N, WICHITA, KS, 67212" "500 S LIMUEL ST, WICHITA, KS, 67235" "7600 W 21ST ST N, WICHITA, KS, 67205" ...
##  $ lat                    : num  37.7 37.7 37.7 37.7 37.7 ...
##  $ lng                    : num  -97.4 -97.4 -97.5 -97.4 -97.4 ...
##  $ subject_age            : num  16 44 20 21 28 27 15 20 23 NA ...
##  $ subject_race           : chr  "white" "white" "white" "hispanic" ...
##  $ subject_sex            : chr  "female" "male" "male" "female" ...
##  $ type                   : chr  "vehicular" "vehicular" "vehicular" "vehicular" ...
##  $ disposition            : chr  "DISMISSED" "GUILTY (IVR)" "DISMISSED WITH PREJUDICE; DISMISSED WITH PREJUDICE" "GUILTY" ...
##  $ violation              : chr  "RUN STOP SIGN" "SPEED OVER LIMIT" "DUI; INATTENTIVE DRIVING" "SPEED OVER LIMIT" ...
##  $ citation_issued        : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ outcome                : chr  "citation" "citation" "citation" "citation" ...
##  $ posted_speed           : num  NA 40 NA 40 40 40 NA NA NA NA ...
##  $ vehicle_color          : chr  "BURGUNDY OR MAROON" "\"ALUMINUM, SILVER\"" "WHITE" "\"ALUMINUM, SILVER\"" ...
##  $ vehicle_make           : chr  "JEEP (1989 TO PRESENT)" "HYUNDAI" "HONDA" "TOYOTA" ...
##  $ vehicle_model          : chr  NA "TUCSON" NA NA ...
##  $ vehicle_year           : num  2008 NA NA NA NA ...
##  $ raw_defendant_race     : chr  "W" "W" "W" "W" ...
##  $ raw_defendant_ethnicity: chr  "N" "N" "N" "H" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   X1 = col_double(),
##   ..   raw_row_number = col_character(),
##   ..   date = col_date(format = ""),
##   ..   time = col_time(format = ""),
##   ..   location = col_character(),
##   ..   lat = col_double(),
##   ..   lng = col_double(),
##   ..   subject_age = col_double(),
##   ..   subject_race = col_character(),
##   ..   subject_sex = col_character(),
##   ..   type = col_character(),
##   ..   disposition = col_character(),
##   ..   violation = col_character(),
##   ..   citation_issued = col_logical(),
##   ..   outcome = col_character(),
##   ..   posted_speed = col_double(),
##   ..   vehicle_color = col_character(),
##   ..   vehicle_make = col_character(),
##   ..   vehicle_model = col_character(),
##   ..   vehicle_year = col_double(),
##   ..   raw_defendant_race = col_character(),
##   ..   raw_defendant_ethnicity = col_character()
##   .. )
wichita %>% glimpse() # one line per column: name, type and the first few values
## Observations: 57,750
## Variables: 22
## $ X1                      <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,...
## $ raw_row_number          <chr> "923578", "923657", "912091", "923680"...
## $ date                    <date> 2016-01-01, 2016-01-01, 2016-01-01, 2...
## $ time                    <time> 18:00:00, 18:08:00, 18:11:00, 18:13:0...
## $ location                <chr> "N WEST ST, KS, 67205", "8000 W 13TH S...
## $ lat                     <dbl> 37.74143, 37.70880, 37.67482, 37.72402...
## $ lng                     <dbl> -97.38976, -97.44059, -97.48999, -97.4...
## $ subject_age             <dbl> 16, 44, 20, 21, 28, 27, 15, 20, 23, NA...
## $ subject_race            <chr> "white", "white", "white", "hispanic",...
## $ subject_sex             <chr> "female", "male", "male", "female", "m...
## $ type                    <chr> "vehicular", "vehicular", "vehicular",...
## $ disposition             <chr> "DISMISSED", "GUILTY (IVR)", "DISMISSE...
## $ violation               <chr> "RUN STOP SIGN", "SPEED OVER LIMIT", "...
## $ citation_issued         <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TR...
## $ outcome                 <chr> "citation", "citation", "citation", "c...
## $ posted_speed            <dbl> NA, 40, NA, 40, 40, 40, NA, NA, NA, NA...
## $ vehicle_color           <chr> "BURGUNDY OR MAROON", "\"ALUMINUM, SIL...
## $ vehicle_make            <chr> "JEEP (1989 TO PRESENT)", "HYUNDAI", "...
## $ vehicle_model           <chr> NA, "TUCSON", NA, NA, "SILVERADO", "NE...
## $ vehicle_year            <dbl> 2008, NA, NA, NA, NA, NA, NA, 2008, 20...
## $ raw_defendant_race      <chr> "W", "W", "W", "W", "W", "W", "W", "W"...
## $ raw_defendant_ethnicity <chr> "N", "N", "N", "H", "H", "N", "H", "H"...
names(wichita) # list the column headers of the data table
##  [1] "X1"                      "raw_row_number"         
##  [3] "date"                    "time"                   
##  [5] "location"                "lat"                    
##  [7] "lng"                     "subject_age"            
##  [9] "subject_race"            "subject_sex"            
## [11] "type"                    "disposition"            
## [13] "violation"               "citation_issued"        
## [15] "outcome"                 "posted_speed"           
## [17] "vehicle_color"           "vehicle_make"           
## [19] "vehicle_model"           "vehicle_year"           
## [21] "raw_defendant_race"      "raw_defendant_ethnicity"

After viewing the dataset, you can analyze it to see the min, max, mean, median and other values for each variable. These are called descriptive statistics.

# Descriptive statistics for every column of the stops table.
wichita %>% summary()
##        X1        raw_row_number          date                time         
##  Min.   :    1   Length:57750       Min.   :2016-01-01   Length:57750     
##  1st Qu.:14438   Class :character   1st Qu.:2016-03-16   Class1:hms       
##  Median :28876   Mode  :character   Median :2016-05-29   Class2:difftime  
##  Mean   :28876                      Mean   :2016-06-10   Mode  :numeric   
##  3rd Qu.:43313                      3rd Qu.:2016-08-31                    
##  Max.   :57750                      Max.   :2016-12-31                    
##                                                                           
##    location              lat             lng           subject_age   
##  Length:57750       Min.   :37.47   Min.   :-101.36   Min.   :11.00  
##  Class :character   1st Qu.:37.67   1st Qu.: -97.37   1st Qu.:24.00  
##  Mode  :character   Median :37.69   Median : -97.34   Median :33.00  
##                     Mean   :37.69   Mean   : -97.33   Mean   :36.71  
##                     3rd Qu.:37.70   3rd Qu.: -97.28   3rd Qu.:48.00  
##                     Max.   :38.48   Max.   : -96.75   Max.   :99.00  
##                     NA's   :1167    NA's   :1167      NA's   :10128  
##  subject_race       subject_sex            type          
##  Length:57750       Length:57750       Length:57750      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  disposition         violation         citation_issued   outcome         
##  Length:57750       Length:57750       Mode:logical    Length:57750      
##  Class :character   Class :character   TRUE:57750      Class :character  
##  Mode  :character   Mode  :character                   Mode  :character  
##                                                                          
##                                                                          
##                                                                          
##                                                                          
##   posted_speed    vehicle_color      vehicle_make       vehicle_model     
##  Min.   : 20.00   Length:57750       Length:57750       Length:57750      
##  1st Qu.: 30.00   Class :character   Class :character   Class :character  
##  Median : 40.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 39.93                                                           
##  3rd Qu.: 40.00                                                           
##  Max.   :304.00                                                           
##  NA's   :35149                                                            
##   vehicle_year   raw_defendant_race raw_defendant_ethnicity
##  Min.   :1962    Length:57750       Length:57750           
##  1st Qu.:2001    Class :character   Class :character       
##  Median :2005    Mode  :character   Mode  :character       
##  Mean   :2005                                              
##  3rd Qu.:2009                                              
##  Max.   :2999                                              
##  NA's   :43236

There are some dplyr verbs that you need to memorize (see https://learn.r-journalism.com/en/wrangling/dplyr/dplyr/ for more). First, the `select` verb helps you grab one or more columns from a dataset.

# Keep only the subject_race column of the stops table.
race <- wichita %>%
  select(subject_race)

Second, the `group_by` verb splits the rows into groups that share the same value. The `summarize` verb usually goes along with `group_by`: here it counts the number of stops in each group and computes each group's share of all stops.

# One row per race group: `value` is the number of stops recorded for that
# group and `prop` is that count divided by the total number of stops.
race <- race %>%
  group_by(subject_race) %>%
  summarize(value = n()) %>%
  mutate(prop = value / sum(value))

# Show the `race` table in the data viewer, but only in an interactive
# session: View() needs a graphical data viewer, which is not available when
# the script is run non-interactively (for example with Rscript).
if (interactive()) {
  View(race)
}

# Attach the population count of each race group to its stop count, then
# compute stop_rate as stops divided by the number of people in that group.
stops <- race %>%
  left_join(population, by = "subject_race") %>%
  mutate(stop_rate = value / num_people)

## Data Visualization

We will need certain packages to be installed and loaded before creating our charts.

### Bar Chart

# make the plot

# Strongly prefer fixed notation over scientific notation when numbers are
# formatted, so that small rates print as plain decimals.
options(scipen = 10000)

# Horizontal bar chart of stop_rate per race group, bars ordered by rate.
# Explicit axis titles are set because the defaults would be the raw
# expressions "reorder(subject_race, stop_rate)" and "stop_rate".
bar <- ggplot(
  stops,
  aes(x = reorder(subject_race, stop_rate), y = stop_rate)
) +
  geom_col(position = "identity", fill = "yellow") +
  geom_hline(yintercept = 0) +
  labs(
    title = "Stopped Drivers by Race",
    subtitle = "African American drivers got stopped the most in the city of Wichita Kansas",
    x = "Driver race",
    y = "Stops per resident"
  ) +
  coord_flip() # flip so the race labels read horizontally

bar

### Interactive Map with leaflet

#install.packages("httpuv")
#install.packages("leaflet")
library(httpuv)
library(leaflet)
# Basic map: the default tile layer, centred and zoomed on one point, with a
# single marker at that point whose popup reads "Wichita, KS".
m <- leaflet() %>%
  addTiles() %>%
  setView(lat = 37.685327, lng = -97.317163, zoom = 16) %>%
  addMarkers(lat = 37.685327, lng = -97.317163, popup = "Wichita, KS")

m
# Colour lookup for the race groups: each label in `domain` is paired with
# the colour at the same position in the palette.
# NOTE: this reuses the name `race`, so from here on `race` is the palette
# function and no longer the summary table built earlier in the script.
race <- colorFactor(
  palette = c("coral1", "black", "yellow", "darkolivegreen", "darkgrey"),
  domain = c(
    "white", "black", "asian/pacific islander", "hispanic", "other/unknown"
  ),
  ordered = TRUE
)

# One small filled circle per stop, coloured by the driver's race.
# Rows without coordinates are dropped up front: the stops table has 1167
# rows with missing lat/lng, which leaflet would otherwise skip with a
# warning. The popup is written as a formula so that its text is built from
# the same filtered rows as the markers.
m2 <- wichita %>%
  dplyr::filter(!is.na(lat), !is.na(lng)) %>%
  leaflet() %>%
  addProviderTiles(providers$OpenStreetMap) %>%
  setView(lng = -97.31716337, lat = 37.685327, zoom = 11) %>%
  addCircleMarkers(
    lng = ~lng,
    lat = ~lat,
    popup = ~paste("This is a", subject_race, "and", subject_sex, "driver."),
    weight = 1,
    radius = 2,
    color = ~race(subject_race),
    stroke = FALSE,
    fillOpacity = 1
  )

m2